In [ ]:
# Copyright 2019 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

Modern CNN Architecture - Lab 2

In this lab, you will attempt to find an improvement on a mini-ResNet for CIFAR-10.

Below is a composable "class" based version for building ResNet networks. Spend a few moments looking at the structure and get familiar.


In [ ]:
import tensorflow as tf
from tensorflow.keras import Model, Input
from tensorflow.keras.layers import Conv2D, MaxPooling2D, ZeroPadding2D, BatchNormalization, ReLU
from tensorflow.keras.layers import Dense, GlobalAveragePooling2D, Add

class ResNetV2(object):
    """ Construct a Residual Convolution Network Network V2 """
    # Meta-parameter: list of groups: number of filters and number of blocks
    groups = { 50 : [ (64, 3), (128, 4), (256, 6),  (512, 3) ],           # ResNet50
               101: [ (64, 3), (128, 4), (256, 23), (512, 3) ],           # ResNet101
               152: [ (64, 3), (128, 8), (256, 36), (512, 3) ]            # ResNet152
             }
    _model = None
    init_weights = 'he_normal'

    def __init__(self, n_layers, input_shape=(224, 224, 3), n_classes=1000):
        """ Construct a Residual Convolutional Neural Network V2
            n_layers   : number of layers
            input_shape: input shape
            n_classes  : number of output classes
        """
        if n_layers not in [50, 101, 152]:
            raise Exception("ResNet: Invalid value for n_layers")

        # The input tensor
        inputs = Input(input_shape)

        # The stem convolutional group
        x = self.stem(inputs)

        # The learner
        x = self.learner(x, self.groups[n_layers])

        # The classifier for 1000 classes
        outputs = self.classifier(x, n_classes)

        # Instantiate the Model
        self._model = Model(inputs, outputs)

    @property
    def model(self):
        return self._model

    @model.setter
    def model(self, _model):
        self._model = _model

    def stem(self, inputs):
        """ Construct the Stem Convolutional Group 
            inputs : the input vector
        """
        # The 224x224 images are zero padded (black - no signal) to be 230x230 images prior to the first convolution
        x = ZeroPadding2D(padding=(3, 3))(inputs)
    
        # First Convolutional layer uses large (coarse) filter
        x = Conv2D(64, (7, 7), strides=(2, 2), padding='valid', use_bias=False, kernel_initializer=self.init_weights)(x)
        x = BatchNormalization()(x)
        x = ReLU()(x)
    
        # Pooled feature maps will be reduced by 75%
        x = ZeroPadding2D(padding=(1, 1))(x)
        x = MaxPooling2D((3, 3), strides=(2, 2))(x)
        return x

    def learner(self, x, groups):
        """ Construct the Learner
            x     : input to the learner
            groups: list of groups: number of filters and blocks
        """
        # First Residual Block Group (not strided)
        n_filters, n_blocks = groups.pop(0)
        x = ResNetV2.group(x, n_filters, n_blocks, strides=(1, 1))

        # Remaining Residual Block Groups (strided)
        for n_filters, n_blocks in groups:
            x = ResNetV2.group(x, n_filters, n_blocks)
        return x
    
    @staticmethod
    def group(x, n_filters, n_blocks, strides=(2, 2), init_weights=None):
        """ Construct a Residual Group
            x         : input into the group
            n_filters : number of filters for the group
            n_blocks  : number of residual blocks with identity link
            strides   : whether the projection block is a strided convolution
        """
        # Double the size of filters to fit the first Residual Group
        x = ResNetV2.projection_block(x, n_filters, strides=strides, init_weights=init_weights)

        # Identity residual blocks
        for _ in range(n_blocks):
            x = ResNetV2.identity_block(x, n_filters, init_weights=init_weights)
        return x

    @staticmethod
    def identity_block(x, n_filters, init_weights=None):
        """ Construct a Bottleneck Residual Block with Identity Link
            x        : input into the block
            n_filters: number of filters
        """
        if init_weights is None:
            init_weights = ResNetV2.init_weights
    
        # Save input vector (feature maps) for the identity link
        shortcut = x
    
        ## Construct the 1x1, 3x3, 1x1 convolution block
    
        # Dimensionality reduction
        x = BatchNormalization()(x)
        x = ReLU()(x)
        x = Conv2D(n_filters, (1, 1), strides=(1, 1), use_bias=False, kernel_initializer=init_weights)(x)

        # Bottleneck layer
        x = BatchNormalization()(x)
        x = ReLU()(x)
        x = Conv2D(n_filters, (3, 3), strides=(1, 1), padding="same", use_bias=False, kernel_initializer=init_weights)(x)

        # Dimensionality restoration - increase the number of output filters by 4X
        x = BatchNormalization()(x)
        x = ReLU()(x)
        x = Conv2D(n_filters * 4, (1, 1), strides=(1, 1), use_bias=False, kernel_initializer=init_weights)(x)

        # Add the identity link (input) to the output of the residual block
        x = Add()([shortcut, x])
        return x

    @staticmethod
    def projection_block(x, n_filters, strides=(2,2), init_weights=None):
        """ Construct a Bottleneck Residual Block of Convolutions with Projection Shortcut
            Increase the number of filters by 4X
            x        : input into the block
            n_filters: number of filters
            strides  : whether the first convolution is strided
        """
        # Construct the projection shortcut
        # Increase filters by 4X to match shape when added to output of block
        shortcut = BatchNormalization()(x)
        shortcut = Conv2D(4 * n_filters, (1, 1), strides=strides, use_bias=False, kernel_initializer='he_normal')(shortcut)

        ## Construct the 1x1, 3x3, 1x1 convolution block
    
        # Dimensionality reduction
        x = BatchNormalization()(x)
        x = ReLU()(x)
        x = Conv2D(n_filters, (1, 1), strides=(1,1), use_bias=False, kernel_initializer='he_normal')(x)

        # Bottleneck layer
        # Feature pooling when strides=(2, 2)
        x = BatchNormalization()(x)
        x = ReLU()(x)
        x = Conv2D(n_filters, (3, 3), strides=strides, padding='same', use_bias=False, kernel_initializer='he_normal')(x)

        # Dimensionality restoration - increase the number of filters by 4X
        x = BatchNormalization()(x)
        x = ReLU()(x)
        x = Conv2D(4 * n_filters, (1, 1), strides=(1, 1), use_bias=False, kernel_initializer='he_normal')(x)

        # Add the projection shortcut to the output of the residual block
        x = Add()([x, shortcut])
        return x

    def classifier(self, x, n_classes):
        """ Construct the Classifier Group 
            x         : input to the classifier
            n_classes : number of output classes
        """
        # Pool at the end of all the convolutional residual blocks
        x = GlobalAveragePooling2D()(x)

        # Final Dense Outputting Layer for the outputs
        outputs = Dense(n_classes, activation='softmax', kernel_initializer=self.init_weights)(x)
        return outputs

# Example
# resnet = ResNetV2(50)

Starting mini-ResNet

Below is a mini-ResNet I wrote for CIFAR-10. Notice how at the bottleneck layer the feature maps are 3 x 3 (max pooling).

Model Summary

REMOVED for brevity ...
batch_normalization_783 (BatchN (None, 16, 16, 8)    32          add_259[0][0]                    
__________________________________________________________________________________________________
conv2d_789 (Conv2D)             (None, 8, 8, 1024)   262144      re_lu_782[0][0]                  
__________________________________________________________________________________________________
conv2d_786 (Conv2D)             (None, 8, 8, 1024)   8192        batch_normalization_783[0][0]    
__________________________________________________________________________________________________
add_260 (Add)                   (None, 8, 8, 1024)   0           conv2d_789[0][0]                 
                                                                 conv2d_786[0][0]                 
__________________________________________________________________________________________________
batch_normalization_787 (BatchN (None, 8, 8, 1024)   4096        add_260[0][0]                    
__________________________________________________________________________________________________
re_lu_783 (ReLU)                (None, 8, 8, 1024)   0           batch_normalization_787[0][0]    
__________________________________________________________________________________________________
conv2d_790 (Conv2D)             (None, 8, 8, 256)    262144      re_lu_783[0][0]                  
__________________________________________________________________________________________________
batch_normalization_788 (BatchN (None, 8, 8, 256)    1024        conv2d_790[0][0]                 
__________________________________________________________________________________________________
re_lu_784 (ReLU)                (None, 8, 8, 256)    0           batch_normalization_788[0][0]    
__________________________________________________________________________________________________
conv2d_791 (Conv2D)             (None, 8, 8, 256)    589824      re_lu_784[0][0]                  
__________________________________________________________________________________________________
batch_normalization_789 (BatchN (None, 8, 8, 256)    1024        conv2d_791[0][0]                 
__________________________________________________________________________________________________
re_lu_785 (ReLU)                (None, 8, 8, 256)    0           batch_normalization_789[0][0]    
__________________________________________________________________________________________________
conv2d_792 (Conv2D)             (None, 8, 8, 1024)   262144      re_lu_785[0][0]                  
__________________________________________________________________________________________________
add_261 (Add)                   (None, 8, 8, 1024)   0           add_260[0][0]                    
                                                                 conv2d_792[0][0]                 
__________________________________________________________________________________________________
flatten_1 (Flatten)             (None, 65536)        0           add_261[0][0]                    
__________________________________________________________________________________________________
dense_1 (Dense)                 (None, 10)           655370      flatten_1[0][0]                  
==================================================================================================
Total params: 2,656,334
Trainable params: 2,648,998
Non-trainable params: 7,336

Training

Below is the results for training for 10 epochs.

Train on 45000 samples, validate on 5000 samples
Epoch 1/10
45000/45000 [==============================] - 1229s 27ms/sample - loss: 4.3040 - acc: 0.1834 - val_loss: 2.1594 - val_acc: 0.2208
Epoch 2/10
45000/45000 [==============================] - 1029s 23ms/sample - loss: 2.0595 - acc: 0.2479 - val_loss: 1.9784 - val_acc: 0.2804
Epoch 3/10
45000/45000 [==============================] - 1144s 25ms/sample - loss: 1.9655 - acc: 0.2876 - val_loss: 1.9719 - val_acc: 0.2832
Epoch 4/10
45000/45000 [==============================] - 1149s 25ms/sample - loss: 1.8521 - acc: 0.3316 - val_loss: 1.7835 - val_acc: 0.3534
Epoch 5/10
45000/45000 [==============================] - 1227s 27ms/sample - loss: 1.7317 - acc: 0.3791 - val_loss: 1.7436 - val_acc: 0.3712
Epoch 6/10
45000/45000 [==============================] - 1138s 25ms/sample - loss: 1.6158 - acc: 0.4204 - val_loss: 1.6352 - val_acc: 0.4106
Epoch 7/10
45000/45000 [==============================] - 1570s 35ms/sample - loss: 1.4964 - acc: 0.4667 - val_loss: 1.4699 - val_acc: 0.4772
Epoch 8/10
45000/45000 [==============================] - 1148s 26ms/sample - loss: 1.3796 - acc: 0.5071 - val_loss: 1.3872 - val_acc: 0.5066
Epoch 9/10
45000/45000 [==============================] - 1189s 26ms/sample - loss: 1.2626 - acc: 0.5513 - val_loss: 1.3557 - val_acc: 0.5160
Epoch 10/10
45000/45000 [==============================] - 1134s 25ms/sample - loss: 1.1348 - acc: 0.6020 - val_loss: 1.3638 - val_acc: 0.5306

Try to Improve

How could we improve this?

  1. Perhaps adding regularization (dropout) and replace Flatten with GlobalAverage2D?

  2. Perhaps reduce the number of filters and add another ResNet group?

  3. Perhaps andcraft a different configuration for the second group?

  4. Perhaps concatenate the output from the stem convolution to the output of the last group (but you will have to make the feature maps the same size)?

  5. Think of your own idea?

If this is a classroom, we will split into 4 teams and each team will use a different approach.


In [ ]:
# Make mini-ResNetV1 for CIFAR-10
from tensorflow.keras import Input, Model
from tensorflow.keras.layers import Conv2D, Flatten, Dense

# Stem
inputs = Input((32, 32, 3))
x = Conv2D(32, (3, 3), strides=1, padding='same', activation='relu')(inputs)

# Learner
# Residual group: 2 blocks, 128 filters
# Residual block with projection, 256 filters
# Residual block with identity, 256 filters
x = ResNetV2.group(x, 2, 128)
x = ResNetV2.projection_block(x, 256)
x = ResNetV2.identity_block(x, 256)

# Classifier
x = Flatten()(x)
outputs = Dense(10, activation='softmax')(x)
model = Model(inputs, outputs)
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['acc'])
model.summary()

In [ ]:
from tensorflow.keras.datasets import cifar10
import numpy as np
(x_train, y_train), (x_test, y_test) = cifar10.load_data()
x_train = (x_train / 255.0).astype(np.float32)
x_test  = (x_test  / 255.0).astype(np.float32)

model.fit(x_train, y_train, epochs=10, batch_size=32, validation_split=0.1, verbose=1)

In [ ]: